import requests
from bs4 import BeautifulSoup
import re


def open_url(url, timeout=10):
    """Fetch *url* with a browser-like User-Agent and return the response.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so a stalled
            connection would hang the whole crawl.

    Returns:
        The ``requests.Response`` object for the page.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
    }
    res = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on error statuses instead of parsing an error page.
    res.raise_for_status()
    return res


def find_movies(res, depth, page_size=25):
    """Extract one formatted line per movie from a list page.

    Args:
        res: Response-like object whose ``text`` attribute holds the page HTML.
        depth: Zero-based page index, used to compute each movie's overall
            position in the ranking.
        page_size: Number of movies per page (used for the position offset).

    Returns:
        A list of strings, one per movie, each ending with a newline and
        containing position, title, rating, details and tagline.
    """
    soup = BeautifulSoup(res.text, 'html.parser')
    result = []

    for item in soup.find_all("div", class_="item"):
        # Title: an entry without one is not a usable movie record, so skip
        # it rather than crash the whole crawl with AttributeError.
        title_tag = item.find("span", class_="title")
        if title_tag is None:
            continue
        title = title_tag.text

        # Rating (placeholder when the element is absent).
        rating_tag = item.find("span", class_="rating_num")
        rating = rating_tag.text if rating_tag is not None else "无"
        rank = f'评分:{rating}'

        # Details paragraph; guard both lookups since either may be missing.
        bd = item.find("div", class_="bd")
        info_tag = bd.find("p") if bd is not None else None
        info_text = info_tag.get_text(strip=True) if info_tag is not None else ""
        # Drop ASCII letters so only the non-English part of the text remains.
        info_text = re.sub(r'[a-zA-Z]', '', info_text)

        # One-line tagline; not every entry has one.
        quote = item.find("span", class_="inq")
        intro = quote.text if quote is not None else "无"

        # Position is derived from the number of entries already emitted so
        # skipped entries do not leave gaps within the page.
        index = depth * page_size + len(result) + 1
        result.append(
            f'第{index}名--- {title} --- {rank.center(10)} '
            f'--- {info_text} --- {intro}\n'
        )
    return result


def find_depth(res):
    """Return the number of list pages advertised by the page's paginator.

    Every link inside ``div.paginator`` is scanned for a ``start=<n>``
    query parameter, and the largest offset determines the page count.
    Using the maximum, rather than the last link, matters because the last
    anchor in a paginator may be a "next page" link that points at the
    second page, not the final one.

    Args:
        res: Response-like object whose ``text`` attribute holds the page HTML.

    Returns:
        The page count as an int; 10 when no paginator or no usable link
        is found.
    """
    default_depth = 10
    soup = BeautifulSoup(res.text, 'html.parser')
    paginator = soup.find("div", class_="paginator")
    if paginator is None:
        return default_depth

    starts = []
    # href=True skips anchors without an href attribute.
    for link in paginator.find_all("a", href=True):
        match = re.search(r'start=(\d+)', link["href"])
        if match:
            starts.append(int(match.group(1)))

    if not starts:
        return default_depth
    # Offsets advance by 25 per page, so the largest one marks the last page.
    return max(starts) // 25 + 1


def main():
    """Crawl every page of the Douban Top 250 list and save it to a text file.

    Downloads the first page to learn how many pages exist, extracts the
    movie lines from each page, and writes them all to
    ``豆瓣TOP250电影.txt`` in UTF-8.
    """
    host = "https://movie.douban.com/top250"
    first_page = open_url(host)
    depth = find_depth(first_page)
    result = []

    for index in range(depth):
        url = host + f'?start={25 * index}'
        print(f"爬取页面: {url}")
        # The bare list URL was already fetched above to read the paginator;
        # reuse that response for the first page instead of downloading it
        # again (assumes the bare URL and ?start=0 serve the same page).
        res = first_page if index == 0 else open_url(url)
        result.extend(find_movies(res, index))

    # Each entry already ends with a newline, so write them in one call.
    with open("豆瓣TOP250电影.txt", "w", encoding="utf-8") as f:
        f.writelines(result)
    print("爬取完成，结果已保存到'豆瓣TOP250电影.txt'")


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()